{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Classification\n", "\n", "The goal of classification is to find a function that separates the data into positive/negative labels. In the case of a linear classifier, this reduces to finding a set of parameters $w^\\star$ such that, $$ \\begin{align} w^\\star &= \\arg \\min_w \\sum_{i=1}^{N} \\left[y_i\\neq \\text{sign} (w^\\top x_i) \\right] \\\\ &= \\arg \\min_w \\sum_{i=1}^{N} l_{0/1} (w; x_i, y_i) \\end{align}.$$ \n", "\n", "The problem with the $l_{0/1}$ loss, is that it is non-convex (and non-differentiable), hence other surrogate losses must be used to optimize the number of points. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Code source: Sebastian Curi and Andreas Krause, based on Jaques Grobler (sklearn demos).\n", "# License: BSD 3 clause\n", "\n", "# We start importing some modules and running some magic commands\n", "% matplotlib inline\n", "% reload_ext autoreload\n", "% load_ext autoreload\n", "% autoreload 2\n", "\n", "# General math and plotting modules.\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# Project files.\n", "from util import gradient_descent, generate_linear_separable_data, generate_circular_separable_data\n", "import plot_helpers\n", "from classifiers import Perceptron, SVM, Logistic\n", "from regularizers import L1Regularizer, L2Regularizer\n", "\n", "# Widget and formatting modules\n", "import ipywidgets\n", "from ipywidgets import interact, interactive, interact_manual\n", "import pylab\n", "# If in your browser the figures are not nicely vizualized, change the following line. \n", "pylab.rcParams['figure.figsize'] = (10, 5)\n", "\n", "# Machine Learning library. \n", "import sklearn\n", "from sklearn import svm\n", "from sklearn.linear_model import SGDClassifier\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "# from sklearn import datasets, linear_model\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_points = 100 # Number of points per class\n", "noise = 0.5 # Noise Level (needed for data generation).\n", "np.random.seed(42)\n", "X, Y = generate_linear_separable_data(num_points, noise=noise, dim=2)\n", "# X = X - np.mean(X)\n", "\n", "fig = plt.subplot(111)\n", "opt = {'marker': 'ro', 'label': '+', 'size': 8}\n", "plot_helpers.plot_data(X[np.where(Y == 1)[0], 0], X[np.where(Y == 1)[0], 1], fig=fig, options=opt)\n", "opt = {'marker': 'bs', 'label': '-', 'x_label': '$x$', 'y_label': '$y$', 'size': 8, 'legend': True}\n", "plot_helpers.plot_data(X[np.where(Y == -1)[0], 0], X[np.where(Y == -1)[0], 1], fig=fig, options=opt)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Separate into train and test sets!\n", "np.random.seed(42)\n", "indexes = np.arange(0, 2*num_points, 1)\n", "np.random.shuffle(indexes)\n", "num_train = int(np.ceil(2*.05*num_points))\n", "\n", "X_train = X[indexes[:num_train]]\n", "Y_train = Y[indexes[:num_train]]\n", "\n", "X_test = X[indexes[num_train:]]\n", "Y_test = Y[indexes[num_train:]]\n", "\n", "fig = plt.subplot(111)\n", "\n", "opt = {'marker': 'ro', 'fillstyle': 'full', 'label': '+ Train', 'size': 8}\n", "plot_helpers.plot_data(X_train[np.where(Y_train == 1)[0], 0], X_train[np.where(Y_train == 1)[0], 1], fig=fig, options=opt)\n", "opt = {'marker': 'bs', 'fillstyle': 'full', 'label': '- Train', 'size': 8}\n", "plot_helpers.plot_data(X_train[np.where(Y_train 
== -1)[0], 0], X_train[np.where(Y_train == -1)[0], 1], fig=fig, options=opt)\n", "\n", "opt = {'marker': 'ro', 'fillstyle': 'none', 'label': '+ Test', 'size': 8}\n", "plot_helpers.plot_data(X_test[np.where(Y_test == 1)[0], 0], X_test[np.where(Y_test == 1)[0], 1], fig=fig, options=opt)\n", "opt = {'marker': 'bs', 'fillstyle': 'none', 'label': '- Test', 'size': 8, \n", "       'x_label': '$x$', 'y_label': '$y$', 'legend': True}\n", "plot_helpers.plot_data(X_test[np.where(Y_test == -1)[0], 0], X_test[np.where(Y_test == -1)[0], 1], fig=fig, options=opt)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The Perceptron Algorithm\n", "\n", "The perceptron loss is defined as: $$L(w; X, Y) = \\sum_{i=1}^{N} L_p(w; x_i, y_i) = \\sum_{i=1}^{N} \\max \\{ 0, -y_i w^\\top x_i \\}.$$\n", "\n", "The loss function is continuous, but not differentiable at $y_i w^\\top x_i=0$. The subgradient, however, exists and hence (stochastic) subgradient descent can still be applied. The subgradient is:\n", "\n", "$$ \\partial L_p(w; x_i,y_i) = \\left\\{\\begin{array}{cc} 0 & \\text{if } -y_i w^\\top x_i < 0 \\\\ -y_i x_i & \\text{if } -y_i w^\\top x_i > 0 \\\\ \\left[0, -y_i x_i \\right] & \\text{if } -y_i w^\\top x_i = 0 \\end{array} \\right. $$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def change_learning_params(reg, eta0, n_iter, batch_size):\n", "    classifier = Perceptron(X_train, Y_train)\n", "    classifier.load_test_data(X_test, Y_test)\n", "    \n", "    regularizer = L2Regularizer(np.power(10., reg))\n", "    np.random.seed(42)\n", "    w0 = np.random.randn(3, )\n", "\n", "    opts = {'eta0': eta0,\n", "            'n_iter': n_iter,\n", "            'batch_size': batch_size,\n", "            'n_samples': X_train.shape[0],\n", "            'algorithm': 'SGD',\n", "            'learning_rate_scheduling': None,\n", "            }\n", "    trajectory, indexes = gradient_descent(w0, classifier, regularizer, opts)\n", "\n", "    contour_plot = plt.subplot(121)\n", "    error_plot = plt.subplot(122)\n", "    \n", "    opt = {'marker': 'ro', 'fillstyle': 'full', 'label': '+ Train', 'size': 8}\n", "    plot_helpers.plot_data(X_train[np.where(Y_train == 1)[0], 0], X_train[np.where(Y_train == 1)[0], 1], fig=contour_plot, options=opt)\n", "    opt = {'marker': 'bs', 'fillstyle': 'full', 'label': '- Train', 'size': 8}\n", "    plot_helpers.plot_data(X_train[np.where(Y_train == -1)[0], 0], X_train[np.where(Y_train == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", "    opt = {'marker': 'ro', 'fillstyle': 'none', 'label': '+ Test', 'size': 8}\n", "    plot_helpers.plot_data(X_test[np.where(Y_test == 1)[0], 0], X_test[np.where(Y_test == 1)[0], 1], fig=contour_plot, options=opt)\n", "    opt = {'marker': 'bs', 'fillstyle': 'none', 'label': '- Test', 'size': 8}\n", "    plot_helpers.plot_data(X_test[np.where(Y_test == -1)[0], 0], X_test[np.where(Y_test == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", "    contour_opts = {'n_points': 50, 'x_label': '$x$', 'y_label': '$y$', 'sgd_point': True, 'n_classes': 4}\n", "    error_opts = {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'}\n", "    \n", "    opts = {'contour_opts': contour_opts, 'error_opts': error_opts}\n", "    plot_helpers.classification_progression(X, Y, trajectory, indexes, classifier, \n", "                                            contour_plot=contour_plot, error_plot=error_plot, \n", "                                            options=opts)\n", "\n", "interact_manual(change_learning_params,\n", "                reg=ipywidgets.FloatSlider(value=-3,\n", "                                           min=-3,\n", "                                           max=3,\n", "                                           step=0.5,\n", "                                           readout_format='.1f',\n", "                                           description='Regularization 10^:',\n", "                                           style={'description_width': 'initial'},\n", "                                           
continuous_update=False),\n", "                eta0=ipywidgets.FloatSlider(value=1,\n", "                                            min=0.1,\n", "                                            max=2,\n", "                                            step=0.1,\n", "                                            readout_format='.1f',\n", "                                            description='Learning rate:',\n", "                                            style={'description_width': 'initial'},\n", "                                            continuous_update=False),\n", "                n_iter=ipywidgets.IntSlider(value=30,\n", "                                            min=5,\n", "                                            max=100,\n", "                                            step=1,\n", "                                            description='Number of iterations:',\n", "                                            style={'description_width': 'initial'},\n", "                                            continuous_update=False),\n", "                batch_size=ipywidgets.IntSlider(value=1,\n", "                                                min=1,\n", "                                                max=X_train.shape[0],\n", "                                                step=1,\n", "                                                description='Batch Size:',\n", "                                                style={'description_width': 'initial'},\n", "                                                continuous_update=False)\n", "                );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The SVM Algorithm\n", "\n", "The SVM (hinge) loss is defined as: $$L(w; X, Y) = \\sum_{i=1}^{N} L_{\\text{svm}} (w; x_i, y_i) = \\sum_{i=1}^{N} \\max \\{ 0, 1-y_i w^\\top x_i \\}.$$\n", "\n", "The loss function is continuous, but not differentiable at $y_i w^\\top x_i=1$. The subgradient, however, exists and hence (stochastic) subgradient descent can still be applied. The subgradient is:\n", "\n", "$$ \\partial L_{\\text{svm}}(w;x_i,y_i) = \\left\\{\\begin{array}{cc} 0 & \\text{if } 1-y_i w^\\top x_i < 0 \\\\ -y_i x_i & \\text{if } 1-y_i w^\\top x_i > 0 \\\\ \\left[0, -y_i x_i \\right] & \\text{if } 1-y_i w^\\top x_i = 0 \\end{array} \\right. $$\n", "\n", "The difference with the perceptron loss is that the SVM loss enforces a margin: a point incurs zero loss only if it is classified correctly with $y_i w^\\top x_i \\geq 1$. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def change_learning_params(reg, eta0, n_iter, batch_size):\n", "    classifier = SVM(X_train, Y_train)\n", "    classifier.load_test_data(X_test, Y_test)\n", "    \n", "    regularizer = L2Regularizer(np.power(10., reg))\n", "    np.random.seed(42)\n", "    w0 = np.random.randn(3, )\n", "\n", "    opts = {'eta0': eta0,\n", "            'n_iter': n_iter,\n", "            'batch_size': batch_size,\n", "            'n_samples': X_train.shape[0],\n", "            'algorithm': 'SGD',\n", "            'learning_rate_scheduling': None\n", "            }\n", "\n", "    trajectory, indexes = gradient_descent(w0, classifier, regularizer, opts)\n", "    \n", "    contour_plot = plt.subplot(121)\n", "    error_plot = plt.subplot(122)\n", "    \n", "    opt = {'marker': 'ro', 'fillstyle': 'full', 'label': '+ Train', 'size': 8}\n", "    plot_helpers.plot_data(X_train[np.where(Y_train == 1)[0], 0], X_train[np.where(Y_train == 1)[0], 1], fig=contour_plot, options=opt)\n", "    opt = {'marker': 'bs', 'fillstyle': 'full', 'label': '- Train', 'size': 8}\n", "    plot_helpers.plot_data(X_train[np.where(Y_train == -1)[0], 0], X_train[np.where(Y_train == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", "    opt = {'marker': 'ro', 'fillstyle': 'none', 'label': '+ Test', 'size': 8}\n", "    plot_helpers.plot_data(X_test[np.where(Y_test == 1)[0], 0], X_test[np.where(Y_test == 1)[0], 1], fig=contour_plot, options=opt)\n", "    opt = {'marker': 'bs', 'fillstyle': 'none', 'label': '- Test', 'size': 8}\n", "    plot_helpers.plot_data(X_test[np.where(Y_test == -1)[0], 0], X_test[np.where(Y_test == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", "    contour_opts = {'n_points': 100, 'x_label': '$x$', 'y_label': '$y$', 'sgd_point': True, 'n_classes': 4}\n", "    error_opts = {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'}\n", "    \n", "    opts = {'contour_opts': contour_opts, 'error_opts': error_opts}\n", "    plot_helpers.classification_progression(X, Y, trajectory, indexes, classifier, \n", "                                            contour_plot=contour_plot, error_plot=error_plot, \n", "                                            options=opts)\n", "\n", 
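"# Added illustration (not part of the original demo): a quick numerical sanity check of the hinge loss\n", "# and one valid subgradient for a single, made-up point, following the formulas above.\n", "# w_demo, x_demo and y_demo are hypothetical values chosen only for this check.\n", "w_demo = np.array([0.5, -1.0])  # hypothetical weight vector\n", "x_demo = np.array([1.0, 0.5])   # hypothetical feature vector\n", "y_demo = 1                      # hypothetical label\n", "margin = y_demo * w_demo.dot(x_demo)\n", "hinge_loss = max(0.0, 1.0 - margin)\n", "# Subgradient: 0 when the margin exceeds 1, otherwise -y * x (at the kink, any convex combination is valid).\n", "hinge_subgradient = np.zeros_like(w_demo) if margin > 1 else -y_demo * x_demo\n", "print('hinge loss:', hinge_loss, ' subgradient:', hinge_subgradient)\n", "\n", 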
"interact_manual(change_learning_params,\n", " reg=ipywidgets.FloatSlider(value=-3,\n", " min=-3,\n", " max=3,\n", " step=0.5,\n", " readout_format='.1f',\n", " description='Regularization 10^:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False), \n", " eta0=ipywidgets.FloatSlider(value=1,\n", " min=1e-1,\n", " max=2,\n", " step=1 * 1e-1,\n", " readout_format='.1f',\n", " description='Learning rate:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False),\n", " n_iter=ipywidgets.IntSlider(value=50,\n", " min=5,\n", " max=100,\n", " step=1,\n", " description='Number of iterations:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False),\n", " batch_size=ipywidgets.IntSlider(value=1,\n", " min=1,\n", " max=X_train.shape[0],\n", " step=1,\n", " description='Batch Size:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False)\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The Logistic Regression Algorithm\n", "\n", "The logistic loss is defined as: $$L(w; X, Y) = \\sum_{i=1}^{N} L_{\\sigma}(w; x_i, y_i) = \\sum_{i=1}^{N} \\log \\left( 1 + \\exp \\left(-y_i w^\\top x_i \\right) \\right) .$$\n", "\n", "The loss function is differentiable, hence the gradient is:\n", "\n", "$$ \\frac{\\partial L_{\\sigma}(w; x_i, y_i)}{\\partial w} = -\\frac{\\exp \\left(-y_i w^\\top x_i \\right)}{1+\\exp \\left(-y_i w^\\top x_i \\right)} y x_i$$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def change_learning_params(reg, eta0, n_iter, batch_size):\n", " classifier = Logistic(X_train, Y_train)\n", " classifier.load_test_data(X_test, Y_test)\n", " \n", " regularizer = L2Regularizer(np.power(10., reg))\n", " np.random.seed(42)\n", " w0 = np.random.randn(3, )\n", "\n", " opts = {'eta0': eta0,\n", " 'n_iter': n_iter,\n", " 'batch_size': batch_size,\n", " 'n_samples': X_train.shape[0],\n", " 'algorithm': 'SGD',\n", " 'learning_rate_scheduling': 'AdaGrad'\n", " }\n", "\n", " trajectory, indexes = gradient_descent(w0, classifier, regularizer, opts)\n", " \n", " contour_plot = plt.subplot(121)\n", " error_plot = plt.subplot(122)\n", " \n", " opt = {'marker': 'ro', 'fillstyle': 'full', 'label': '+ Train', 'size': 8}\n", " plot_helpers.plot_data(X_train[np.where(Y_train == 1)[0], 0], X_train[np.where(Y_train == 1)[0], 1], fig=contour_plot, options=opt)\n", " opt = {'marker': 'bs', 'fillstyle': 'full', 'label': '- Train', 'size': 8}\n", " plot_helpers.plot_data(X_train[np.where(Y_train == -1)[0], 0], X_train[np.where(Y_train == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", " opt = {'marker': 'ro', 'fillstyle': 'none', 'label': '+ Test', 'size': 8}\n", " plot_helpers.plot_data(X_test[np.where(Y_test == 1)[0], 0], X_test[np.where(Y_test == 1)[0], 1], fig=contour_plot, options=opt)\n", " opt = {'marker': 'bs', 'fillstyle': 'none', 'label': '- Test', 'size': 8}\n", " plot_helpers.plot_data(X_test[np.where(Y_test == -1)[0], 0], X_test[np.where(Y_test == -1)[0], 1], fig=contour_plot, options=opt)\n", "\n", " contour_opts = {'n_points': 100, 'x_label': '$x$', 'y_label': '$y$', 'sgd_point': True, 'n_classes': 4}\n", " error_opts = {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'}\n", " \n", " opts = {'contour_opts': contour_opts, 'error_opts': error_opts}\n", " plot_helpers.classification_progression(X, Y, trajectory, indexes, classifier, \n", " contour_plot=contour_plot, error_plot=error_plot, \n", " options=opts)\n", "\n", 
"interact_manual(change_learning_params,\n", " reg=ipywidgets.FloatSlider(value=-3,\n", " min=-3,\n", " max=3,\n", " step=0.5,\n", " readout_format='.1f',\n", " description='Regularization 10^:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False), \n", " eta0=ipywidgets.FloatSlider(value=0.5,\n", " min=1e-1,\n", " max=2,\n", " step=1 * 1e-1,\n", " readout_format='.1f',\n", " description='Learning rate:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False),\n", " n_iter=ipywidgets.IntSlider(value=100,\n", " min=5,\n", " max=50,\n", " step=1,\n", " description='Number of iterations:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False),\n", " batch_size=ipywidgets.IntSlider(value=1,\n", " min=1,\n", " max=X_train.shape[0],\n", " step=1,\n", " description='Batch Size:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False)\n", " );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# L1 Regularization\n", "\n", "The L-1 regularization method uses a regularizer of the form $R(w) = ||w||_1$ which is non-differentiable. However, the subgradient exits and is:\n", "\n", "$$\\partial(||w|||_1) = \\left\\{\\begin{array}{cc} \\text{sign} (w)& \\text{if } w \\neq 0 \\\\ [-1, 1] & \\text{if } w = 0 \\end{array} \\right. $$\n", "\n", "This regularization method penalizes weights and induces sparsity in the solutions. That is, most of the entries of the solution $w^\\star$ will be zero. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_positive = 50 # Number of points per class\n", "num_negative = 50 # Number of points per class\n", "\n", "noise = 0.3 # Noise Level (needed for data generation).\n", "\n", "X, Y = generate_linear_separable_data(num_positive, num_negative, offset=np.array([1, .2]), noise=noise, dim=2)\n", "X = X - np.mean(X, axis=0)\n", "\n", "def regularization(regularizer, reg):\n", " np.random.seed(42)\n", " classifier = SGDClassifier(loss='hinge', penalty=regularizer, alpha = np.power(10., reg), random_state=1)\n", " classifier.fit(X[:,:2], Y)\n", " \n", " X0, X1 = X[:, 0], X[:, 1]\n", " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " h = .02\n", " xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", "\n", " fig = plt.subplot(111)\n", " contour = plot_helpers.plot_contours(fig, classifier, xx, yy, cmap=plt.cm.jet, alpha=0.3)\n", " plt.colorbar(contour)\n", " opt = {'marker': 'r*', 'label': '+'}\n", " plot_helpers.plot_data(X[np.where(Y == 1)[0], 0], X[np.where(Y == 1)[0], 1], fig=fig, options=opt)\n", " opt = {'marker': 'bs', 'label': '-', 'legend': True}\n", " plot_helpers.plot_data(X[np.where(Y == -1)[0], 0], X[np.where(Y == -1)[0], 1], fig=fig, options=opt)\n", " \n", "interact(regularization,\n", " regularizer=ipywidgets.RadioButtons(\n", " options=['l1', 'l2'],\n", " value='l1',\n", " description='Algorithm:',\n", " style={'description_width': 'initial'}),\n", " reg=ipywidgets.FloatSlider(\n", " value=-3,\n", " min=-3,\n", " max=0,\n", " step=0.5,\n", " description='Regularizer 10^:',\n", " style={'description_width': 'initial'},\n", " continuous_update=False)\n", " );" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Cost Sensitivity\n", "\n", "When there are more positive than negative examples the then, to overcome this uneven distribution of data, the cost of mislabeling can be changed 
between the two classes to compensate for this uneven distribution of the data. \n", "\n", "The cost can be expressed as: $$ w^\\star = \\arg \\min_w \\sum_{i=1}^{N} c(y_i) \\left[y_i\\neq \\text{sign} (w^\\top x_i) \\right], $$ \n", "where $c(y_i)$ is a cost that depends on the class. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_positive = 20  # Number of positive points.\n", "num_negative = 100  # Number of negative points.\n", "noise = 0.3  # Noise level (needed for data generation).\n", "\n", "X, Y = generate_linear_separable_data(num_positive, num_negative, noise=noise, dim=2)\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.svm import LinearSVC\n", "\n", "def cost_sensitivity(algorithm, ratio):\n", "    class_weight = {-1: 1, 1: np.power(10., ratio)}\n", "    if algorithm == 'SVM':\n", "        classifier = LinearSVC(class_weight=class_weight)\n", "    elif algorithm == 'Perceptron':\n", "        classifier = SGDClassifier(loss='perceptron', random_state=1, class_weight=class_weight)\n", "\n", "    classifier.fit(X[:, :2], Y)\n", "    \n", "    X0, X1 = X[:, 0], X[:, 1]\n", "    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", "    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", "    h = .02\n", "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", "\n", "    fig = plt.subplot(111)\n", "    plot_helpers.plot_contours(fig, classifier, xx, yy, cmap=plt.cm.jet, alpha=0.3)\n", "    opt = {'marker': 'r*', 'label': '+'}\n", "    plot_helpers.plot_data(X[np.where(Y == 1)[0], 0], X[np.where(Y == 1)[0], 1], fig=fig, options=opt)\n", "    opt = {'marker': 'bs', 'label': '-', 'legend': True}\n", "    plot_helpers.plot_data(X[np.where(Y == -1)[0], 0], X[np.where(Y == -1)[0], 1], fig=fig, options=opt)\n", "\n", "    \n", "interact(cost_sensitivity,\n", "         algorithm=ipywidgets.RadioButtons(\n", "             options=['SVM', 'Perceptron'],\n", "             value='SVM',\n", "             description='Algorithm:',\n", "             style={'description_width': 'initial'}),\n", "         ratio=ipywidgets.FloatSlider(\n", "             value=0,\n", "             min=-1,\n", "             max=3,\n", "             step=0.5,\n", "             description='Cost Ratio 10^:',\n", "             style={'description_width': 'initial'},\n", "             continuous_update=False)\n", "         );" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Multiclass Classification\n", "\n", "In multiclass classification there are two common strategies: OvO (One-vs-One) and OvR (One-vs-Rest), also called OvA (One-vs-All). OvO trains one binary classifier for every pair of classes and predicts by majority vote, while OvR trains one binary classifier per class (that class against all the others) and predicts the class with the largest score. 
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier\n", "from sklearn import datasets\n", "from sklearn.svm import LinearSVC\n", "from sklearn.linear_model import SGDClassifier\n", "\n", "np.random.seed(1)\n", "iris = datasets.load_iris()\n", "X, y = iris.data[:,:2], iris.target\n", "mean = X.mean(axis=0)\n", "std = X.std(axis=0)\n", "X = (X - mean) / std\n", "\n", "def multiclass(strategy):\n", " if strategy == 'OvO':\n", " classifier = OneVsOneClassifier(SGDClassifier(loss='perceptron', alpha=0.001, n_iter=100, random_state=0))\n", "# classifier = OneVsOneClassifier(LinearSVC(random_state=0))\n", " colors = ['g', 'r', 'b']\n", " elif strategy == 'OvR' or strategy == 'OvA':\n", " classifier = OneVsRestClassifier(SGDClassifier(loss='perceptron', alpha=0.001, n_iter=100, random_state=0))\n", "# classifier = OneVsRestClassifier(LinearSVC(random_state=0))\n", " colors = ['b', 'g', 'r']\n", " \n", "\n", " classifier.fit(X, y)\n", " X0, X1 = X[:, 0], X[:, 1]\n", " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", " h = .02\n", " xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", "\n", " fig = plt.subplot(111)\n", " plot_helpers.plot_contours(fig, classifier, xx, yy, cmap=plt.cm.jet, alpha=0.3)\n", " \n", " opt = {'marker': 'bs', 'label': '-', 'x_label': '$x$', 'y_label': '$y$', 'size': 8}\n", " plot_helpers.plot_data(X[np.where(y == 0)[0], 0], X[np.where(y == 0)[0], 1], fig=fig, options=opt)\n", " opt = {'marker': 'g*', 'label': '+', 'size': 8}\n", " plot_helpers.plot_data(X[np.where(y == 1)[0], 0], X[np.where(y == 1)[0], 1], fig=fig, options=opt)\n", " opt = {'marker': 'ro', 'label': '-', 'x_label': '$x$', 'y_label': '$y$', 'size': 8, 'legend': True}\n", " plot_helpers.plot_data(X[np.where(y == 2)[0], 0], X[np.where(y == 2)[0], 1], fig=fig, options=opt)\n", " \n", "\n", " def plot_hyperplane(clf, color):\n", " coef = clf.coef_\n", " intercept = clf.intercept_\n", " def line(x0):\n", " return (-(x0 * coef[0, 0]) - intercept[0]) / coef[0, 1]\n", "\n", " plt.plot([x_min, x_max], [line(x_min), line(x_max)],\n", " ls=\"--\", color=color)\n", "\n", " \n", " for estimator, color in zip(classifier.estimators_, colors):\n", " plot_hyperplane(estimator, color)\n", "\n", " fig.set_xlim([x_min, x_max])\n", " fig.set_ylim([y_min, y_max]);\n", "\n", "interact(multiclass, strategy=['OvO', 'OvA']); " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }